library(tidyverse)
── Attaching core tidyverse packages ──────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4 ── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(WGCNA)
Loading required package: dynamicTreeCut
Loading required package: fastcluster
Attaching package: ‘fastcluster’
The following object is masked from ‘package:stats’:
hclust
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'rmarkdown':
method from
print.paged_df
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Attaching package: ‘WGCNA’
The following object is masked from ‘package:stats’:
cor
library(gplots)
Attaching package: ‘gplots’
The following object is masked from ‘package:stats’:
lowess
options(stringsAsFactors = FALSE)
load sample info
sample.description <- read.csv("../input/sample.description.csv")
load reads
lcpm <- read.csv("../output/log2cpm.csv.gz", row.names = 1, check.names = FALSE)
head(lcpm)
dim(lcpm)
[1] 26704 48
Filter for sporophyte samples:
sample.description <- sample.description %>% filter(str_detect(tissue, "S5|WYS"))
lcpm <- lcpm[,sample.description$sample]
Filter for genes with the highest coefficient of variation
CV <- apply(lcpm, 1, \(x) abs(sd(x)/mean(x)))
hist(log10(CV))
names(CV) <- rownames(lcpm)
CV[str_detect(names(CV), "18G076300|33G031700")]
Ceric.18G076300.v2.1 Ceric.33G031700.v2.1
0.08763174 0.18394629
quantile(CV, 0.23)
23%
0.08608846
lcpm.filter <- lcpm[CV > quantile(CV, 0.23),]
dim(lcpm.filter)
[1] 20562 30
WGCNA wants genes in columns
lcpm.filter.t <- t(lcpm.filter)
Soft thresholding
powers <- c(c(1:10), seq(from = 12, to=20, by=2))
sft <- pickSoftThreshold(lcpm.filter.t, powerVector = powers, verbose = 5,networkType = "signed hybrid", blockSize = 15000)
pickSoftThreshold: calculating connectivity for given powers...
..working on genes 1 through 15000 of 20562
Warning: executing %dopar% sequentially: no parallel backend registered
..working on genes 15001 through 20562 of 20562
sizeGrWindow(9, 5)
par(mfrow = c(1,2))
cex1 <- 0.9
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red")
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
choose 8
softPower <- 8
adjacency <- adjacency(lcpm.filter.t, power = softPower, type = "signed hybrid")
# Turn adjacency into topological overlap
TOM <- TOMsimilarity(adjacency, TOMType = "signed");
..connectivity..
..matrix multiplication (system BLAS)..
..normalization..
..done.
dissTOM <- 1-TOM
# Call the hierarchical clustering function
geneTree <- hclust(as.dist(dissTOM), method = "average")
# Plot the resulting clustering tree (dendrogram)
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04)
define modules
# We like large modules, so we set the minimum module size relatively high:
minModuleSize <- 30;
# Module identification using dynamic tree cut:
dynamicMods <- cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit <- 2, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
..done.
table(dynamicMods)
dynamicMods
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
2597 1248 1139 890 791 778 659 653 508 498 482 459 436 428 421 408 404 380 376 367
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
350 336 314 312 308 306 283 273 251 246 243 236 226 198 198 190 184 174 172 170
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
145 143 142 121 118 113 110 107 98 94 93 87 87 77 74 61
# Convert numeric labels into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
dynamicColors
bisque4 black blue brown brown4 cyan
110 659 1248 1139 113 428
darkgreen darkgrey darkmagenta darkolivegreen darkorange darkorange2
336 312 198 226 306 118
darkred darkslateblue darkturquoise floralwhite green greenyellow
350 107 314 121 791 482
grey60 ivory lightcyan lightcyan1 lightgreen lightpink4
404 142 408 143 380 61
lightsteelblue1 lightyellow magenta maroon mediumpurple3 midnightblue
145 376 508 74 170 421
navajowhite2 orange orangered4 paleturquoise palevioletred3 pink
77 308 172 243 87 653
plum1 plum2 purple red royalblue saddlebrown
174 98 498 778 367 251
salmon salmon4 sienna3 skyblue skyblue3 steelblue
436 87 198 273 184 246
tan thistle1 thistle2 turquoise violet white
459 93 94 2597 236 283
yellow yellowgreen
890 190
# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
merge similar modules
# Calculate eigengenes
MEList <- moduleEigengenes(lcpm.filter.t, colors = dynamicColors)
MEs <- MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss <- 1-cor(MEs);
# Cluster module eigengenes
METree <- hclust(as.dist(MEDiss), method = "average");
# Plot the result
sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
merge with correlation > 0.75
MEDissThres = 0.25
# Plot the cut line into the dendrogram
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
abline(h=MEDissThres, col = "red")
# Call an automatic merging function
merge = mergeCloseModules(lcpm.filter.t, dynamicColors, cutHeight = MEDissThres, verbose = 3)
mergeCloseModules: Merging modules whose distance is less than 0.25
multiSetMEs: Calculating module MEs.
Working on set 1 ...
moduleEigengenes: Calculating 56 module eigengenes in given set.
multiSetMEs: Calculating module MEs.
Working on set 1 ...
moduleEigengenes: Calculating 31 module eigengenes in given set.
multiSetMEs: Calculating module MEs.
Working on set 1 ...
moduleEigengenes: Calculating 26 module eigengenes in given set.
Calculating new MEs...
multiSetMEs: Calculating module MEs.
Working on set 1 ...
moduleEigengenes: Calculating 26 module eigengenes in given set.
# The merged module colors
mergedColors = merge$colors
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
compare pre and post merge
sizeGrWindow(12, 9)
#pdf(file = "Plots/geneDendro-3.pdf", wi = 9, he = 6)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
#dev.off()
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs
table(merge$colors)
black blue brown cyan darkgreen darkgrey
659 2521 3736 428 423 775
darkorange2 darkred darkslateblue green greenyellow grey60
118 730 107 1250 482 825
ivory lightcyan lightcyan1 lightsteelblue1 magenta maroon
1032 1439 1381 145 912 261
mediumpurple3 orangered4 plum2 saddlebrown salmon4 skyblue
170 2120 98 251 87 273
steelblue thistle1
246 93
length(table(merge$colors))
[1] 26
median(table(merge$colors))
[1] 455
Which module is LFY in?
CrLFY1 <- "Ceric.33G031700.2.v2.1" # this is the primary transcript. There is another but it is expressed at very low levels.
CrLFY2 <- "Ceric.18G076300"
module.assignment <- tibble(geneID=colnames(lcpm.filter.t), module = mergedColors)
module.assignment %>%
filter(str_detect(geneID, "18G076300|33G031700"))
Interesting: they are in different modules(!). But both are quite large…
module.assignment %>% group_by(module) %>% summarize(n_genes = n()) %>% arrange(n_genes)
Plot eigengenes
Make sure sample info sheet is in the correct order.
rownames(lcpm.filter.t) %>% str_replace_all("\\.", "-") == sample.description$sample
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[20] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
sample.eigen <- cbind(sample.description, MEs)
sample.eigen
sample.eigen.l <- sample.eigen %>%
mutate(gt_tissue=str_c(base_gt, "-", tissue)) %>%
pivot_longer(starts_with("ME"), names_to = "ME")
sample.eigen.means <- sample.eigen.l %>%
group_by(gt_tissue, ME) %>%
summarise(value = mean(value))
`summarise()` has grouped output by 'gt_tissue'. You can override using the `.groups` argument.
sample.eigen.l %>%
ggplot(aes(x=gt_tissue, y = value)) +
geom_point(aes(color = tissue)) +
geom_line(data=sample.eigen.means, group = 1, lwd=.3) +
facet_wrap(~ME, ncol=5) +
theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=.5)) +
scale_color_brewer(type="qual", palette = "Set3")
A heat map:
MEs.m <- as.matrix(MEs)
heatmap.2(MEs.m, trace="none", cexRow= 0.6, col="bluered")
save(module.assignment, MEs, lcpm.filter, CrLFY1, CrLFY2, file="../output/WGCNA_sporophyteSamples.Rdata")